import seaborn as sns
import pandas as pd
import numpy as np
incsv contains protein sequence ID and whether they are in CE0 or PL9.
infile contains the output of DIAMOND Fast. Caculation for BSR is done.
incsv = f"../Data/CE0_PL9/CE0_PL9_accessions.csv"
# read in CE0 and PL9 accessions/ sequence names.
dffam = pd.read_csv(incsv) # read csv into a pandas dataframe.
infile = f"../Results/CE0_PL9/CE0_PL9rundiamondfast.tsv" # Set diamond input.
df = pd.read_table(infile, header=None) # Create a dataframe of Diamond input.
df.columns = ["qseqid", "sseqid", "qlen", "qstart", "qend", "pident", "bitscore"]
# Name the columnes in the dataframe.
df["normalisedbitscore"]=df["bitscore"]/df["qlen"]
# Caculate the normalised bitscore.
df["qcov"] = (df["qend"]-df["qstart"])/df["qlen"] # Calculate query coverage.
The culstermap is plotted with a large figure size to ensure all the xtick and ytick lables can be read.
widedfx = pd.pivot(
df,
index="qseqid",
columns= "sseqid",
values="normalisedbitscore"
) # Turn into a wide dataframe.
widedf = widedfx.fillna(0) # Remove any values NaN and replace with 0.
famqseqid = []
#create empty lists for the families of the query and subject sequence IDs.
famsseqid = []
for qseqid in widedf: # going through the row names in df.
fil = dffam[dffam['genbank_accession'] == qseqid] # filter through accessions
if len(fil) > 1:
family = 'CE0+PL9' # identify and annotate multi-domain CAZymes
else:
family = fil.iloc[0]['family']
famqseqid.append(family)
for sseqid in widedf.columns: # same as before but for columns not rows.
fil = dffam[dffam['genbank_accession'] == sseqid]
if len(fil) > 1:
family = 'CE0+PL9'
else:
family = fil.iloc[0]['family']
famsseqid.append(family)
widedf['Family'] = famqseqid # Add family annotations to BSR dataframe
pd_series = widedf.pop('Family')
lut = dict(zip(set(famqseqid), sns.color_palette("Set1", n_colors=len(set(famqseqid)))))
row_colours = pd_series.map(lut)
def plotdiamond(fast): # Function is called plotdiamond.
"""Function takes the output matrix calculated by diamond and plots a clustermap with
the argument as DIAMOND sensitvity, eg) 'Fast'. """
sns.set(font_scale=0.5)
figure=sns.clustermap(# Plot clustermap of the data frame created.
widedf,
cmap="Blues",
figsize=(450, 450),
row_colors=row_colours,
col_colors=row_colours,
yticklabels=1,
xticklabels=1
);
for label in list(lut.keys()):
figure.ax_row_dendrogram.bar(0, 0, color=lut[label], label=label, linewidth=0)
l3 = figure.ax_row_dendrogram.legend(
title='Legend',
loc='upper right',
ncol=1,
fontsize=400
)
figure.ax_heatmap.set_xlabel("Subject Sequence ID",fontsize=300, labelpad=15)
figure.ax_heatmap.set_ylabel("Query Sequence ID",fontsize=300, labelpad=15)
figure.ax_heatmap.set_title(
f'CE0 and PL9 CAZymes Normalised Bitscore Calculated by Diamond Fast',
fontsize=400,
pad=80
)
figure.savefig(f'../Results/CE0_PL9/CE0_PL9_BSR_hugeREAD.png')
# Save in results folder.
return figure
plotBSR = plotdiamond('fast') # Run function for 100 sequences at fast
help(plotdiamond) # Call functions doc string to explain what the function does